top_coef_plot <- function(final_model, model_name, ds_name, enet = FALSE,
                          reactome = FALSE, reactome_info = NULL) {
  # Bar plot of the top regression coefficients of a fitted model.
  # Coefficients are normalized so their absolute values sum to 100, sorted,
  # and the 5 coefficients at each end of the ranking are displayed.
  #
  # final_model   : fitted model object (e.g. model_fit$finalModel from caret)
  # model_name    : algorithm label used in the plot title
  # ds_name       : dataset label used in the plot title
  # enet          : TRUE for glmnet models whose coef() needs a lambda value
  # reactome      : TRUE to translate 'reactome_*' feature IDs to pathway names
  # reactome_info : data.frame with path_ID / path_names columns; required
  #                 when reactome = TRUE
  #
  # Returns a ggplot object.

  # Extract coefficients. The original enet branch referenced a global
  # `model_ls` and its result was then overwritten unconditionally; both
  # issues are fixed here.
  if (enet) {
    # caret stores the tuned lambda on the glmnet final model as lambdaOpt
    coefficients <- as.matrix(coef(final_model, s = final_model$lambdaOpt))
  } else {
    coefficients <- coef(final_model)
  }

  # Normalize so the absolute coefficients sum to 100, then sort ascending.
  sum.coef <- sum(abs(coefficients))
  coefficients <- coefficients * 100 / sum.coef
  if (length(dim(coefficients)) == 3) {
    # pls::coef() returns a predictors x responses x ncomp array
    coefficients <- coefficients[, 1, 1]
  } else {
    coefficients <- coefficients[, 1]
  }
  coefficients <- sort(coefficients)

  # Build a long data frame holding the 5 coefficients at each end of the
  # ranking.
  # NOTE(review): after the ascending sort, head() holds the most NEGATIVE
  # coefficients yet is labelled 'pos_pred' (and tail() 'neg_pred'); the
  # labels look swapped -- kept as in the original, confirm intent.
  rc_pos <- data.frame(c(head(coefficients, 5)))
  colnames(rc_pos) <- c('pos_pred')
  rc_pos$ID <- rownames(rc_pos)
  rc_pos <- reshape2::melt(rc_pos)
  rc_neg <- data.frame(c(tail(coefficients, 5)))
  colnames(rc_neg) <- c('neg_pred')
  rc_neg$ID <- rownames(rc_neg)
  rc_neg <- reshape2::melt(rc_neg)
  rc <- rbind(rc_pos, rc_neg)
  print(rc$ID[grep('reactome', rc$ID)])

  if (reactome) {
    reactome_rows <- grep('reactome', rc$ID)
    # match() preserves the order of rc$ID; the original which(%in%) returned
    # indices in reactome_info order and could pair IDs with the wrong names.
    k <- match(gsub('rna_', '', rc$ID[reactome_rows]), reactome_info$path_ID)
    rc$ID[reactome_rows] <- reactome_info$path_names[k]
  }

  # Plot: horizontal bars ordered by coefficient value, colored by sign group.
  p <- ggplot(data = rc, aes(reorder(ID, value), value, fill = variable)) +
    geom_bar(stat = "identity") +
    theme_minimal() +
    ggtitle(paste(model_name, ' model with ', ds_name)) +
    xlab("Variable") +
    ylab("Regression coefficient") +
    coord_flip() +
    theme(legend.position = "none", plot.title = element_text(hjust = 0.5))

  p
}

ftimp_gg <- function(model_ls, n_topft = 10, model_name, ds_name,
                     reactome = FALSE, reactome_info = NULL){
    # Print a feature-importance bar plot and per-feature effect plots for a
    # caret-trained model.
    #
    # model_ls      : list with $model_fit (caret train object) and $TrainSet
    #                 (data.frame containing the outcome column 'SILA')
    # n_topft       : number of top features to keep (passed to top_n)
    # model_name    : algorithm label used in the plot title
    # ds_name       : dataset label used in the plot title
    # reactome      : TRUE to translate 'reactome_*' IDs to pathway names
    # reactome_info : data.frame with path_ID / path_names columns; required
    #                 when reactome = TRUE
    library('ggplot2')  # library() errors loudly if the package is missing;
    library('iml')      # require() only returns FALSE, masking the problem

    # Feature importance: keep the n_topft most important features.
    ftimp_rf <- caret::varImp(model_ls$model_fit)
    ftimp_rf <- ftimp_rf$importance %>%
      dplyr::arrange(desc(.)) %>%
      dplyr::top_n(n_topft) %>%
      mutate(Features = rownames(.))
    # Keep the original feature IDs for FeatureEffects below (the model only
    # knows the un-renamed names).
    top_fts <- ftimp_rf$Features
    if(reactome) {
      reactome_rows <- grep('reactome', ftimp_rf$Features)
      # match() preserves the row order of ftimp_rf; the original which(%in%)
      # returned indices in reactome_info order and could pair IDs with the
      # wrong pathway names.
      k <- match(gsub('rna_', '', ftimp_rf$Features[reactome_rows]),
                 reactome_info$path_ID)
      ftimp_rf$Features[reactome_rows] <- reactome_info$path_names[k]
    }
    print(ggplot(ftimp_rf, aes(x=reorder(Features, Overall), y=Overall)) +
        geom_bar(stat="identity", fill='steelblue') +
        labs(x ="Features", y = "Importance %")+
        coord_flip() +
        theme_minimal() +
        ggtitle(paste(model_name, ' model with ', ds_name)) +
        theme(legend.position="none", plot.title = element_text(hjust = 0.5)))

    # Feature effects on the training set (predictors only, outcome removed).
    X <- model_ls$TrainSet[which(names(model_ls$TrainSet) != "SILA")]
    y <- model_ls$TrainSet$SILA
    predictor <- Predictor$new(model_ls$model_fit, data = X, y = y)

    print(plot(FeatureEffects$new(predictor, features = top_fts)))
}

frac_dif <- function(model_ls){
    # Fractional difference (%) between predicted and observed SILA scores.
    #
    # model_ls : list with $pred (predicted values) and $TestSet$SILA
    #            (observed values)
    #
    # Returns a numeric vector: ((pred / observed) - 1) * 100.
    # (The original assigned to a local that shadowed the function name,
    # which also made the return value invisible; return the expression
    # directly instead.)
    ((model_ls$pred / model_ls$TestSet$SILA) - 1) * 100
}
# Accumulator for per-model test-set error metrics; rows are filled in below.
err_df <- data.frame('Model'=NA, 'RMSE'=NA, 'MAE'=NA)
# REACTOME pathway lookup (path_ID -> path_names) used to relabel features.
reactome_info <- read.csv('/Users/senosam/Documents/Massion_lab/data_integration/models/data/rna_reactomeinfo.csv', row.names = 1)

Model based on CyTOF frequencies

ds_name = 'CyTOF frequencies'
model_ds <- models_by_ds$cytof_freq

PLS

Model metrics

model_ls <- model_ds$model_PLS
model_name = 'PLS'
cat('RMSE test: ', model_ls$rmse_test, '\n')
## RMSE test:  0.1472328
cat('MAE test: ', MAE(model_ls$pred, model_ls$TestSet$SILA), '\n')
## MAE test:  0.1394869
model_ls$model_fit
## Partial Least Squares 
## 
## 34 samples
## 39 predictors
## 
## Pre-processing: centered (39), scaled (39) 
## Resampling: Cross-Validated (5 fold, repeated 10 times) 
## Summary of sample sizes: 28, 28, 27, 26, 27, 29, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE       Rsquared   MAE      
##   1      0.1956908  0.1483690  0.1563369
##   2      0.2302795  0.1324251  0.1852570
##   3      0.2351941  0.1341671  0.1906296
##   4      0.2751348  0.1689894  0.2165516
##   5      0.3261610  0.1985935  0.2504491
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 1.
plot(model_ls$model_fit)

err_df[1,] <- c(paste0(ds_name, ': ', model_name), model_ls$rmse_test, MAE(model_ls$pred, model_ls$TestSet$SILA))

Top regression coefficients

The regression coefficients are normalized so their absolute sum is 100 and the result is sorted.

top_coef_plot(model_ls$model_fit$finalModel, model_name, ds_name)
## Using ID as id variables
## Using ID as id variables
## character(0)

Random Forest

Model metrics

model_ls <- model_ds$model_RF
model_name = 'RF'
cat('RMSE test: ', model_ls$rmse_test, '\n')
## RMSE test:  0.138978
cat('MAE test: ', MAE(model_ls$pred, model_ls$TestSet$SILA), '\n')
## MAE test:  0.1320495
model_ls$model_fit
## Random Forest 
## 
## 34 samples
## 39 predictors
## 
## Pre-processing: centered (39), scaled (39) 
## Resampling: Cross-Validated (5 fold, repeated 10 times) 
## Summary of sample sizes: 28, 28, 27, 26, 27, 29, ... 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   RMSE       Rsquared   MAE      
##    2    variance    0.1753103  0.1700511  0.1477809
##    2    extratrees  0.1699220  0.1646694  0.1421207
##   11    variance    0.1768414  0.1525850  0.1497036
##   11    extratrees  0.1716088  0.1296286  0.1447778
##   20    variance    0.1768067  0.1442666  0.1492415
##   20    extratrees  0.1717616  0.1124466  0.1451755
##   29    variance    0.1770195  0.1438997  0.1496551
##   29    extratrees  0.1721370  0.1219529  0.1454077
##   39    variance    0.1769284  0.1383713  0.1494189
##   39    extratrees  0.1728073  0.1142058  0.1459092
## 
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 2, splitrule = extratrees
##  and min.node.size = 5.
plot(model_ls$model_fit)

err_df[2,] <- c(paste0(ds_name, ': ', model_name), model_ls$rmse_test, MAE(model_ls$pred, model_ls$TestSet$SILA))

Feature Importance

ftimp_gg(model_ls, model_name=model_name, ds_name=ds_name)
## Loading required package: iml
## Selecting by Overall
## Warning: UNRELIABLE VALUE: Future ('future_lapply-1') unexpectedly generated
## random numbers without specifying argument 'future.seed'. There is a risk that
## those random numbers are not statistically sound and the overall results might
## be invalid. To fix this, specify 'future.seed=TRUE'. This ensures that proper,
## parallel-safe random numbers are produced via the L'Ecuyer-CMRG method. To
## disable this check, use 'future.seed=NULL', or set option 'future.rng.onMisuse'
## to "ignore".

Model based on CyTOF bulk expression

ds_name = 'CyTOF bulk expression'
model_ds <- models_by_ds$cytof_medianprot

PLS

Model metrics

model_ls <- model_ds$model_PLS
model_name = 'PLS'
cat('RMSE test: ', model_ls$rmse_test, '\n')
## RMSE test:  0.1758155
cat('MAE test: ', MAE(model_ls$pred, model_ls$TestSet$SILA), '\n')
## MAE test:  0.1420185
model_ls$model_fit
## Partial Least Squares 
## 
## 34 samples
## 34 predictors
## 
## Pre-processing: centered (34), scaled (34) 
## Resampling: Cross-Validated (5 fold, repeated 10 times) 
## Summary of sample sizes: 28, 28, 27, 26, 27, 29, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE       Rsquared   MAE      
##   1      0.1859154  0.1526097  0.1513773
##   2      0.2150547  0.2081635  0.1757809
##   3      0.2326452  0.2062358  0.1846154
##   4      0.2898711  0.2588294  0.2225834
##   5      0.3474757  0.2764505  0.2637017
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 1.
plot(model_ls$model_fit)

err_df[3,] <- c(paste0(ds_name, ': ', model_name), model_ls$rmse_test, MAE(model_ls$pred, model_ls$TestSet$SILA))

Top regression coefficients

The regression coefficients are normalized so their absolute sum is 100 and the result is sorted.

top_coef_plot(model_ls$model_fit$finalModel, model_name, ds_name)
## Using ID as id variables
## Using ID as id variables
## character(0)

Random Forest

Model metrics

model_ls <- model_ds$model_RF
model_name = 'RF'
cat('RMSE test: ', model_ls$rmse_test, '\n')
## RMSE test:  0.1487374
cat('MAE test: ', MAE(model_ls$pred, model_ls$TestSet$SILA), '\n')
## MAE test:  0.134784
model_ls$model_fit
## Random Forest 
## 
## 34 samples
## 34 predictors
## 
## Pre-processing: centered (34), scaled (34) 
## Resampling: Cross-Validated (5 fold, repeated 10 times) 
## Summary of sample sizes: 28, 28, 27, 26, 27, 29, ... 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   RMSE       Rsquared   MAE      
##    2    variance    0.1687446  0.1612905  0.1431078
##    2    extratrees  0.1678348  0.1246672  0.1419484
##   10    variance    0.1740050  0.1427438  0.1450840
##   10    extratrees  0.1712549  0.1224263  0.1444594
##   18    variance    0.1761271  0.1427009  0.1463892
##   18    extratrees  0.1719575  0.1240482  0.1446421
##   26    variance    0.1780171  0.1360151  0.1478463
##   26    extratrees  0.1728129  0.1434599  0.1454456
##   34    variance    0.1792265  0.1342647  0.1485463
##   34    extratrees  0.1737264  0.1351547  0.1461453
## 
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 2, splitrule = extratrees
##  and min.node.size = 5.
plot(model_ls$model_fit)

err_df[4,] <- c(paste0(ds_name, ': ', model_name), model_ls$rmse_test, MAE(model_ls$pred, model_ls$TestSet$SILA))

Feature Importance

ftimp_gg(model_ls, model_name=model_name, ds_name=ds_name)
## Selecting by Overall
## Warning: UNRELIABLE VALUE: Future ('future_lapply-1') unexpectedly generated
## random numbers without specifying argument 'future.seed'. There is a risk that
## those random numbers are not statistically sound and the overall results might
## be invalid. To fix this, specify 'future.seed=TRUE'. This ensures that proper,
## parallel-safe random numbers are produced via the L'Ecuyer-CMRG method. To
## disable this check, use 'future.seed=NULL', or set option 'future.rng.onMisuse'
## to "ignore".

Model based on RNA Seq REACTOME scores

ds_name = 'RNA Seq REACTOME scores'
model_ds <- models_by_ds$rna_reactome

PLS

Model metrics

model_ls <- model_ds$model_PLS
model_name = 'PLS'
cat('RMSE test: ', model_ls$rmse_test, '\n')
## RMSE test:  0.1474013
cat('MAE test: ', MAE(model_ls$pred, model_ls$TestSet$SILA), '\n')
## MAE test:  0.1333214
model_ls$model_fit
## Partial Least Squares 
## 
##   34 samples
## 1838 predictors
## 
## Pre-processing: centered (1838), scaled (1838) 
## Resampling: Cross-Validated (5 fold, repeated 10 times) 
## Summary of sample sizes: 28, 28, 27, 26, 27, 29, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE       Rsquared   MAE      
##   1      0.1739414  0.1421956  0.1405954
##   2      0.1803449  0.1893151  0.1425230
##   3      0.2004195  0.1374260  0.1614717
##   4      0.2023606  0.1386374  0.1609985
##   5      0.2015930  0.1528425  0.1600130
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 1.
plot(model_ls$model_fit)

err_df[5,] <- c(paste0(ds_name, ': ', model_name), model_ls$rmse_test, MAE(model_ls$pred, model_ls$TestSet$SILA))

Top regression coefficients

The regression coefficients are normalized so their absolute sum is 100 and the result is sorted.

top_coef_plot(model_ls$model_fit$finalModel, model_name, ds_name, reactome = TRUE, reactome_info = reactome_info)
## Using ID as id variables
## Using ID as id variables
##  [1] "reactome_1693" "reactome_1527" "reactome_803"  "reactome_558" 
##  [5] "reactome_1143" "reactome_1495" "reactome_1374" "reactome_979" 
##  [9] "reactome_1525" "reactome_1077"

Random Forest

Model metrics

model_ls <- model_ds$model_RF
model_name = 'RF'
cat('RMSE test: ', model_ls$rmse_test, '\n')
## RMSE test:  0.1507297
cat('MAE test: ', MAE(model_ls$pred, model_ls$TestSet$SILA), '\n')
## MAE test:  0.1414097
model_ls$model_fit
## Random Forest 
## 
##   34 samples
## 1838 predictors
## 
## Pre-processing: centered (1838), scaled (1838) 
## Resampling: Cross-Validated (5 fold, repeated 10 times) 
## Summary of sample sizes: 28, 28, 27, 26, 27, 29, ... 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   RMSE       Rsquared   MAE      
##      2  variance    0.1618905  0.1546546  0.1358869
##      2  extratrees  0.1605926  0.1699014  0.1339465
##     11  variance    0.1650863  0.1485923  0.1380268
##     11  extratrees  0.1626671  0.1505702  0.1352888
##     60  variance    0.1666801  0.1774619  0.1384627
##     60  extratrees  0.1646094  0.1670552  0.1372455
##    333  variance    0.1684221  0.1757368  0.1394101
##    333  extratrees  0.1662657  0.1799412  0.1384324
##   1838  variance    0.1711252  0.1676104  0.1410888
##   1838  extratrees  0.1669614  0.1802420  0.1383647
## 
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 2, splitrule = extratrees
##  and min.node.size = 5.
plot(model_ls$model_fit)

err_df[6,] <- c(paste0(ds_name, ': ', model_name), model_ls$rmse_test, MAE(model_ls$pred, model_ls$TestSet$SILA))

Feature Importance

ftimp_gg(model_ls, model_name=model_name, ds_name=ds_name, reactome=TRUE, reactome_info=reactome_info)
## Selecting by Overall
## Warning: UNRELIABLE VALUE: Future ('future_lapply-1') unexpectedly generated
## random numbers without specifying argument 'future.seed'. There is a risk that
## those random numbers are not statistically sound and the overall results might
## be invalid. To fix this, specify 'future.seed=TRUE'. This ensures that proper,
## parallel-safe random numbers are produced via the L'Ecuyer-CMRG method. To
## disable this check, use 'future.seed=NULL', or set option 'future.rng.onMisuse'
## to "ignore".

Model based on RNA Seq VIPER TF activity

ds_name = 'RNA Seq VIPER TF activity'
model_ds <- models_by_ds$rna_viper4

PLS

Model metrics

model_ls <- model_ds$model_PLS
model_name = 'PLS'
cat('RMSE test: ', model_ls$rmse_test, '\n')
## RMSE test:  0.1463339
cat('MAE test: ', MAE(model_ls$pred, model_ls$TestSet$SILA), '\n')
## MAE test:  0.130622
model_ls$model_fit
## Partial Least Squares 
## 
##  34 samples
## 118 predictors
## 
## Pre-processing: centered (118), scaled (118) 
## Resampling: Cross-Validated (5 fold, repeated 10 times) 
## Summary of sample sizes: 28, 28, 27, 26, 27, 29, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE       Rsquared   MAE      
##   1      0.1730487  0.1608107  0.1386264
##   2      0.2212056  0.1263251  0.1817183
##   3      0.2210563  0.1134532  0.1803047
##   4      0.2232691  0.1298120  0.1812632
##   5      0.2264314  0.1316729  0.1802890
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 1.
plot(model_ls$model_fit)

err_df[7,] <- c(paste0(ds_name, ': ', model_name), model_ls$rmse_test, MAE(model_ls$pred, model_ls$TestSet$SILA))

Top regression coefficients

The regression coefficients are normalized so their absolute sum is 100 and the result is sorted.

top_coef_plot(model_ls$model_fit$finalModel, model_name, ds_name)
## Using ID as id variables
## Using ID as id variables
## character(0)

Random Forest

Model metrics

model_ls <- model_ds$model_RF
model_name = 'RF'
cat('RMSE test: ', model_ls$rmse_test, '\n')
## RMSE test:  0.1424572
cat('MAE test: ', MAE(model_ls$pred, model_ls$TestSet$SILA), '\n')
## MAE test:  0.1334272
model_ls$model_fit
## Random Forest 
## 
##  34 samples
## 118 predictors
## 
## Pre-processing: centered (118), scaled (118) 
## Resampling: Cross-Validated (5 fold, repeated 10 times) 
## Summary of sample sizes: 28, 28, 27, 26, 27, 29, ... 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   RMSE       Rsquared   MAE      
##     2   variance    0.1640078  0.1543702  0.1354459
##     2   extratrees  0.1622847  0.1548528  0.1335238
##    31   variance    0.1657500  0.1768016  0.1354061
##    31   extratrees  0.1652939  0.1537369  0.1361517
##    60   variance    0.1656851  0.1703241  0.1350524
##    60   extratrees  0.1644074  0.1720150  0.1351572
##    89   variance    0.1652619  0.1926147  0.1349360
##    89   extratrees  0.1645177  0.1821421  0.1353334
##   118   variance    0.1651127  0.1891581  0.1342650
##   118   extratrees  0.1648454  0.1794311  0.1349291
## 
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 2, splitrule = extratrees
##  and min.node.size = 5.
plot(model_ls$model_fit)

err_df[8,] <- c(paste0(ds_name, ': ', model_name), model_ls$rmse_test, MAE(model_ls$pred, model_ls$TestSet$SILA))

Feature Importance

ftimp_gg(model_ls, model_name=model_name, ds_name=ds_name)
## Selecting by Overall
## Warning: UNRELIABLE VALUE: Future ('future_lapply-1') unexpectedly generated
## random numbers without specifying argument 'future.seed'. There is a risk that
## those random numbers are not statistically sound and the overall results might
## be invalid. To fix this, specify 'future.seed=TRUE'. This ensures that proper,
## parallel-safe random numbers are produced via the L'Ecuyer-CMRG method. To
## disable this check, use 'future.seed=NULL', or set option 'future.rng.onMisuse'
## to "ignore".

Model based on WES mutation

ds_name = 'WES mutation'
model_ds <- models_by_ds$wes_binary

PLS

Model metrics

model_ls <- model_ds$model_PLS
model_name = 'PLS'
cat('RMSE test: ', model_ls$rmse_test, '\n')
## RMSE test:  0.154442
cat('MAE test: ', MAE(model_ls$pred, model_ls$TestSet$SILA), '\n')
## MAE test:  0.1396076
model_ls$model_fit
## Partial Least Squares 
## 
##  34 samples
## 169 predictors
## 
## Pre-processing: centered (169), scaled (169) 
## Resampling: Cross-Validated (5 fold, repeated 10 times) 
## Summary of sample sizes: 28, 28, 27, 26, 27, 29, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE       Rsquared   MAE      
##   1      0.1907319  0.1875453  0.1558135
##   2      0.1825731  0.1722682  0.1476898
##   3      0.1881260  0.1785514  0.1509093
##   4      0.1949080  0.1641259  0.1555165
##   5      0.2018338  0.1496834  0.1600058
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 2.
plot(model_ls$model_fit)

err_df[9,] <- c(paste0(ds_name, ': ', model_name), model_ls$rmse_test, MAE(model_ls$pred, model_ls$TestSet$SILA))

Top regression coefficients

The regression coefficients are normalized so their absolute sum is 100 and the result is sorted.

top_coef_plot(model_ls$model_fit$finalModel, model_name, ds_name)
## Using ID as id variables
## Using ID as id variables
## character(0)

Random Forest

Model metrics

model_ls <- model_ds$model_RF
model_name = 'RF'
cat('RMSE test: ', model_ls$rmse_test, '\n')
## RMSE test:  0.1393409
cat('MAE test: ', MAE(model_ls$pred, model_ls$TestSet$SILA), '\n')
## MAE test:  0.1278576
model_ls$model_fit
## Random Forest 
## 
##  34 samples
## 169 predictors
## 
## Pre-processing: centered (169), scaled (169) 
## Resampling: Cross-Validated (5 fold, repeated 10 times) 
## Summary of sample sizes: 28, 28, 27, 26, 27, 29, ... 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   RMSE       Rsquared   MAE      
##     2   variance    0.1659472  0.1558093  0.1398201
##     2   extratrees  0.1660705  0.1563622  0.1396717
##    43   variance    0.1710692  0.1540223  0.1468873
##    43   extratrees  0.1711343  0.1549991  0.1472228
##    85   variance    0.1741523  0.1647829  0.1509367
##    85   extratrees  0.1746352  0.1736047  0.1511754
##   127   variance    0.1771702  0.1751928  0.1538010
##   127   extratrees  0.1773455  0.1772299  0.1539032
##   169   variance    0.1796438  0.1772324  0.1555598
##   169   extratrees  0.1793553  0.1822027  0.1555469
## 
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 2, splitrule = variance
##  and min.node.size = 5.
plot(model_ls$model_fit)

err_df[10,] <- c(paste0(ds_name, ': ', model_name), model_ls$rmse_test, MAE(model_ls$pred, model_ls$TestSet$SILA))

Feature Importance

ftimp_gg(model_ls, model_name=model_name, ds_name=ds_name)
## Selecting by Overall
## Warning: UNRELIABLE VALUE: Future ('future_lapply-1') unexpectedly generated
## random numbers without specifying argument 'future.seed'. There is a risk that
## those random numbers are not statistically sound and the overall results might
## be invalid. To fix this, specify 'future.seed=TRUE'. This ensures that proper,
## parallel-safe random numbers are produced via the L'Ecuyer-CMRG method. To
## disable this check, use 'future.seed=NULL', or set option 'future.rng.onMisuse'
## to "ignore".

Model based on Concatenated Data

ds_name = 'Concatenated Data'
load('/Users/senosam/Documents/Massion_lab/data_integration/models/models_res/models_CONCAT_ls.RData')

PLS

Model metrics

model_ls <- models_CONCAT_ls$model_PLS
model_name = 'PLS'
cat('RMSE test: ', model_ls$rmse_test, '\n')
## RMSE test:  0.1476102
cat('MAE test: ', MAE(model_ls$pred, model_ls$TestSet$SILA), '\n')
## MAE test:  0.1328039
model_ls$model_fit
## Partial Least Squares 
## 
##   34 samples
## 2198 predictors
## 
## Pre-processing: centered (2198), scaled (2198) 
## Resampling: Cross-Validated (5 fold, repeated 10 times) 
## Summary of sample sizes: 28, 28, 27, 26, 27, 29, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE       Rsquared   MAE      
##   1      0.1761748  0.1328592  0.1422889
##   2      0.1836616  0.1718236  0.1448280
##   3      0.1996644  0.1216580  0.1592344
##   4      0.1967117  0.1226756  0.1540897
##   5      0.1937543  0.1253861  0.1502014
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 1.
plot(model_ls$model_fit)

err_df[11,] <- c(paste0(ds_name, ': ', model_name), model_ls$rmse_test, MAE(model_ls$pred, model_ls$TestSet$SILA))

Top regression coefficients

The regression coefficients are normalized so their absolute sum is 100 and the result is sorted.

# , reactome = TRUE, reactome_info = reactome_info
top_coef_plot(model_ls$model_fit$finalModel, model_name, ds_name, reactome = TRUE, reactome_info = reactome_info)
## Using ID as id variables
## Using ID as id variables
## [1] "rna_reactome_1693" "rna_reactome_1527" "rna_reactome_803" 
## [4] "rna_reactome_1495" "rna_reactome_1374" "rna_reactome_979" 
## [7] "rna_reactome_1525" "rna_reactome_1077"

Random Forest

Model metrics

model_ls <- models_CONCAT_ls$model_RF
model_name = 'RF'
cat('RMSE test: ', model_ls$rmse_test, '\n')
## RMSE test:  0.1468419
cat('MAE test: ', MAE(model_ls$pred, model_ls$TestSet$SILA), '\n')
## MAE test:  0.1382979
model_ls$model_fit
## Random Forest 
## 
##   34 samples
## 2198 predictors
## 
## Pre-processing: centered (2198), scaled (2198) 
## Resampling: Cross-Validated (5 fold, repeated 10 times) 
## Summary of sample sizes: 28, 28, 27, 26, 27, 29, ... 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   RMSE       Rsquared   MAE      
##      2  variance    0.1629314  0.1360342  0.1362809
##      2  extratrees  0.1615130  0.1683173  0.1345702
##     11  variance    0.1646622  0.1563520  0.1378350
##     11  extratrees  0.1632687  0.1556167  0.1362748
##     66  variance    0.1662447  0.1686240  0.1381101
##     66  extratrees  0.1646850  0.1479140  0.1373843
##    381  variance    0.1688350  0.1588761  0.1394662
##    381  extratrees  0.1655816  0.1636630  0.1367020
##   2198  variance    0.1721210  0.1484698  0.1419690
##   2198  extratrees  0.1674119  0.1672822  0.1382225
## 
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 2, splitrule = extratrees
##  and min.node.size = 5.
plot(model_ls$model_fit)

err_df[12,] <- c(paste0(ds_name, ': ', model_name), model_ls$rmse_test, MAE(model_ls$pred, model_ls$TestSet$SILA))

Feature Importance

ftimp_gg(model_ls, model_name=model_name, ds_name=ds_name, reactome=TRUE, reactome_info=reactome_info)
## Selecting by Overall
## Warning: UNRELIABLE VALUE: Future ('future_lapply-1') unexpectedly generated
## random numbers without specifying argument 'future.seed'. There is a risk that
## those random numbers are not statistically sound and the overall results might
## be invalid. To fix this, specify 'future.seed=TRUE'. This ensures that proper,
## parallel-safe random numbers are produced via the L'Ecuyer-CMRG method. To
## disable this check, use 'future.seed=NULL', or set option 'future.rng.onMisuse'
## to "ignore".

Fractional Difference all models

# Fractional prediction differences (%) for every dataset/model combination.
fd_cytoffreq_RF <- frac_dif(models_by_ds$cytof_freq$model_RF)
fd_cytoffreq_PLS <- frac_dif(models_by_ds$cytof_freq$model_PLS)

fd_cytofmed_RF <- frac_dif(models_by_ds$cytof_medianprot$model_RF)
fd_cytofmed_PLS <- frac_dif(models_by_ds$cytof_medianprot$model_PLS)

fd_rnareac_RF <- frac_dif(models_by_ds$rna_reactome$model_RF)
fd_rnareac_PLS <- frac_dif(models_by_ds$rna_reactome$model_PLS)

fd_rnaviper_RF <- frac_dif(models_by_ds$rna_viper4$model_RF)
fd_rnaviper_PLS <- frac_dif(models_by_ds$rna_viper4$model_PLS)

fd_wes_RF <- frac_dif(models_by_ds$wes_binary$model_RF)
fd_wes_PLS <- frac_dif(models_by_ds$wes_binary$model_PLS)

fd_cc_RF <- frac_dif(models_CONCAT_ls$model_RF)
fd_cc_PLS <- frac_dif(models_CONCAT_ls$model_PLS)
## Using  as id variables

# Format the error table for display: coerce to numeric, round to 3 digits.
err_df$MAE <- as.numeric(err_df$MAE)
err_df$RMSE <- as.numeric(err_df$RMSE)
err_df$MAE <- round(err_df$MAE, digits = 3)
err_df$RMSE <- round(err_df$RMSE, digits = 3)
knitr::kable(err_df)
Model RMSE MAE
CyTOF frequencies: PLS 0.147 0.139
CyTOF frequencies: RF 0.139 0.132
CyTOF bulk expression: PLS 0.176 0.142
CyTOF bulk expression: RF 0.149 0.135
RNA Seq REACTOME scores: PLS 0.147 0.133
RNA Seq REACTOME scores: RF 0.151 0.141
RNA Seq VIPER TF activity: PLS 0.146 0.131
RNA Seq VIPER TF activity: RF 0.142 0.133
WES mutation: PLS 0.154 0.140
WES mutation: RF 0.139 0.128
Concatenated Data: PLS 0.148 0.133
Concatenated Data: RF 0.147 0.138

Ensemble approach: Majority vote

# True class labels (column 'n_op2') for the samples flagged as test set.
test_true <- traintest_info[which(traintest_info$ds=='test'), 'n_op2']

Random Forest

# Discretize each dataset-specific RF prediction into risk classes
# ('ind' < 0.4 <= 'int' <= 0.6 < 'agg') and take a per-sample majority vote.
res <- matrix(NA, nrow = length(which(traintest_info$ds=='test')), ncol = length(names(models_by_ds)))
colnames(res) <- names(models_by_ds)
model_nm <- 'model_RF'
for(ds in names(models_by_ds)){
  x <- models_by_ds[[ds]][[model_nm]]$pred
  # Vectorized binning; also fixes the original boundary bug where a
  # prediction of exactly 0.4 satisfied neither `< 0.4` nor `> 0.4` and
  # fell through to 'agg' instead of 'int'.
  res[seq_along(x), ds] <- ifelse(x < 0.4, 'ind', ifelse(x <= 0.6, 'int', 'agg'))
}

# Majority vote per sample (ties resolved by which.max: first maximum wins),
# replacing the original grow-by-c() loop.
pred_maj <- apply(res, 1, function(r) names(which.max(table(r))))

confusionMatrix(as.factor(pred_maj), as.factor(test_true))
##      [,1] [,2]
## [1,]    0    0
## [2,]    0    6

PLS

# Same majority-vote ensemble as above, using the PLS models.
res <- matrix(NA, nrow = length(which(traintest_info$ds=='test')), ncol = length(names(models_by_ds)))
colnames(res) <- names(models_by_ds)
model_nm <- 'model_PLS'
for(ds in names(models_by_ds)){
  x <- models_by_ds[[ds]][[model_nm]]$pred
  # Vectorized binning; fixes the boundary bug where exactly 0.4 was
  # classified as 'agg' instead of 'int'.
  res[seq_along(x), ds] <- ifelse(x < 0.4, 'ind', ifelse(x <= 0.6, 'int', 'agg'))
}

# Majority vote per sample (ties resolved by which.max: first maximum wins).
pred_maj <- apply(res, 1, function(r) names(which.max(table(r))))

confusionMatrix(as.factor(pred_maj), as.factor(test_true))
##      [,1] [,2]
## [1,]    0    0
## [2,]    0    7

Concatenation approach

Random Forest

# Discretize the concatenated-data RF predictions into risk classes.
x <- models_CONCAT_ls$model_RF$pred
# Vectorized binning (replaces the grow-by-c() loop); also fixes the boundary
# bug where a prediction of exactly 0.4 fell through to 'agg' instead of 'int'.
res <- ifelse(x < 0.4, 'ind', ifelse(x <= 0.6, 'int', 'agg'))

confusionMatrix(as.factor(res), as.factor(test_true))
##      [,1] [,2]
## [1,]    0    0
## [2,]    0    4

PLS

# Discretize the concatenated-data PLS predictions into risk classes.
x <- models_CONCAT_ls$model_PLS$pred
# Vectorized binning (replaces the grow-by-c() loop); also fixes the boundary
# bug where a prediction of exactly 0.4 fell through to 'agg' instead of 'int'.
res <- ifelse(x < 0.4, 'ind', ifelse(x <= 0.6, 'int', 'agg'))

confusionMatrix(as.factor(res), as.factor(test_true))
##      [,1] [,2]
## [1,]    0    0
## [2,]    0    5
# Classification metrics for the ensemble vs. concatenation approaches
# (values transcribed from the confusion-matrix results above).
mmetrics <- data.frame(Approach = c('Ensemble', 'Ensemble', 'Concatenation', 'Concatenation'),
                       Algorithm = c('RF', 'PLS', 'RF', 'PLS'),
                       Accuracy = c(0.7273, 0.8182, 0.5455, 0.6364),
                       Sensitivity = c(0.6667, 0.7778, 0.4444, 0.5556),
                       # Fixed column-name typo: NVP -> NPV (negative
                       # predictive value, matching the other metric names).
                       NPV = c(0.4, 0.5, 0.2857, 0.3333))

# Long format for faceting: one row per approach/algorithm/metric.
mmetrics <- reshape2::melt(mmetrics)
## Using Approach, Algorithm as id variables
# NOTE(review): `sbst` is not used below; kept in case a later chunk needs it.
sbst <- mmetrics[mmetrics$variable == 'Accuracy', ]
# Grouped bars per metric, one facet per metric.
ggplot(mmetrics, aes(fill = Approach, y = value, x = Algorithm)) +
    geom_bar(position = "dodge", stat = "identity") +
    theme_minimal() +
    ylim(c(0, 1)) +
    labs(title = NULL, x = NULL, y = NULL) +
    scale_fill_brewer(palette = "Dark2") +
    facet_wrap(~variable, scales = 'free') +
    theme(strip.text.x = element_text(size = 18),
          axis.text.x = element_text(size = 15),
          axis.text.y = element_text(size = 12))